import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from numpy import loadtxt

# Load the dataset from CSV.
# Alternative input kept from the original script:
#   r'E:\\建模\\华为杯\\2021年中国研究生数学建模竞赛赛题\\E\\code\\winedata.csv'
with open(r'E:\\建模\\华为杯\\异常\\1.异常.csv', 'r', encoding='UTF-8') as raw_data:
    wine_data = loadtxt(raw_data, delimiter=',')

# No explicit raw_data.close() is needed: leaving the `with` block above has
# already closed the file.

print(wine_data.shape)

# Feature matrix: columns 0, 1 and 2. np.array() copies the slice, so later
# in-place edits of `data` do not write back into `wine_data`.
data = np.array(wine_data[:,0:3])

# Labels: column 4. The 4:5 slice keeps the result two-dimensional, one
# single-element row per sample.
# NOTE(review): column 3 is used neither as a feature nor as the label --
# confirm that skipping it is intentional.
target = np.array(wine_data[:,4:5])

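# Output the shapes of the data and target matrices (NOTE: no print statement follows here; only wine_data.shape is printed above)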
# Shuffling helper
def Data_Order(data,target):
    """Shuffle samples and labels with one shared random permutation.

    A single random ordering of the row indices is drawn from NumPy's global
    random state and applied to both arrays, so every row of ``data`` stays
    paired with its entry in ``target``.

    Args:
        data: 2-D array of samples, one row per sample.
        target: array of labels; its first dimension sets how many indices
            are permuted.

    Returns:
        A ``(shuffled_data, shuffled_target)`` tuple of new arrays.
    """
    sample_count = target.shape[0]

    # The same index order is used for both arrays to keep the pairs aligned.
    shuffled_index = np.random.permutation(sample_count).tolist()

    shuffled_data = data[shuffled_index, :]
    shuffled_target = target[shuffled_index]

    return shuffled_data, shuffled_target

# Replace data/target with their shuffled versions
data,target = Data_Order(data,target)

# Sizes of the training set (70%, rounded down) and the test set (the rest).
# Integer arithmetic is used on purpose: the floating-point form
# int(7 / 10 * n) can come out one too small, e.g. 7 / 10 * 90 evaluates to
# 62.99999999999999 and truncates to 62 instead of 63.
train_num = target.shape[0] * 7 // 10

test_num = target.shape[0] - train_num

print(test_num)

print(train_num)

# Split the shuffled arrays into a training part and a test part
def Data_split(data,target,split_at=None):
    """Cut ``data`` and ``target`` at the same index into train/test parts.

    Args:
        data: sliceable sequence of samples.
        target: sliceable sequence of labels, ordered like ``data``.
        split_at: number of leading items that go to the training part.
            When omitted, the module-level ``train_num`` is used, which is
            what the function always did before this parameter existed.

    Returns:
        The tuple ``(Xtrain, Ytrain, Xtest, Ytest)``. The parts are plain
        slices of the inputs (for NumPy arrays that means views, not copies).
    """
    if split_at is None:
        # Backward-compatible default: fall back to the script-level count.
        split_at = train_num

    Xtrain, Xtest = data[:split_at], data[split_at:]

    Ytrain, Ytest = target[:split_at], target[split_at:]

    return (Xtrain,Ytrain,Xtest,Ytest)

# Receive the training and test sets
Xtrain,Ytrain,Xtest,Ytest = Data_split(data,target)

# Standardize every feature column to zero mean and unit variance.
# The mean and standard deviation are taken from the TRAINING set only and
# then applied to both sets: scaling the test set with its own statistics
# would put train and test features on different scales and distort the
# distances the KNN search compares.
feature_mean = np.mean(Xtrain, axis=0)
feature_std = np.std(Xtrain, axis=0)

# A constant column has a standard deviation of 0; dividing by it would
# produce NaN/inf, so such a column is only centred.
feature_std[feature_std == 0] = 1.0

# Slice assignment writes into the existing arrays, as the original
# column-by-column loop did.
Xtrain[:] = (Xtrain - feature_mean) / feature_std

Xtest[:] = (Xtest - feature_mean) / feature_std

# Neighbour search
def Knn_fit(One_test,Xtrain,Ytrain):
    """Rank all training samples by Euclidean distance to one query sample.

    Args:
        One_test: 1-D feature vector of the sample to classify.
        Xtrain: 2-D array of training samples, one row per sample.
        Ytrain: labels, indexed in the same order as the rows of ``Xtrain``.

    Returns:
        A list with one dict per training sample, sorted by ascending
        distance. Each dict has the keys ``'class'`` (the entry of ``Ytrain``
        for that sample) and ``'distance'`` (its distance to ``One_test``).
        The list is empty when ``Xtrain`` has no samples.
    """
    train = np.asarray(Xtrain, dtype=float)

    if train.size == 0:
        return []

    query = np.asarray(One_test, dtype=float)

    # Euclidean distance: the square root is taken AFTER summing the squared
    # per-feature differences. Taking it per feature first (as the previous
    # version did) yields the sum of absolute differences instead.
    distances = np.sqrt(np.sum((train - query) ** 2, axis=1))

    neighbours = [
        {'class': Ytrain[index], 'distance': distance}
        for index, distance in enumerate(distances)
    ]

    # sorted() is stable, so equally distant samples keep their training order.
    return sorted(neighbours, key=lambda item: item['distance'])

# Predict the class of a single sample
def Knn_predict(One_predictor,k = 3):
    """Predict a label by majority vote among the k nearest training samples.

    The neighbours come from ``Knn_fit`` run against the module-level
    ``Xtrain`` / ``Ytrain``.

    The previous version tested ``label in np.array(dicts)``. ``np.array`` of
    a dict is a 0-d object array holding the dict itself, so that test was
    never true, no vote was counted and 3.0 was always returned. Votes are
    now tallied directly, for whatever labels occur.

    Args:
        One_predictor: feature vector of the sample to classify.
        k: number of nearest neighbours that vote.

    Returns:
        The label with the most votes, as a plain Python scalar. On a tie the
        tied label whose first vote came from the closest neighbour wins.

    Raises:
        ValueError: if there is no neighbour to vote (``k`` < 1 or an empty
            training set).
    """
    votes = {}

    # Knn_fit returns neighbours sorted by ascending distance; keep the first k.
    for neighbour in Knn_fit(One_predictor,Xtrain,Ytrain)[:k]:
        # A label may be a NumPy scalar or a one-element array; .item() turns
        # either into a hashable Python scalar usable as a dict key.
        label = np.asarray(neighbour['class']).item()

        votes[label] = votes.get(label, 0) + 1

    if not votes:
        raise ValueError("Knn_predict needs k >= 1 and a non-empty training set")

    # Dicts keep insertion order and max() returns the first maximal item, so
    # ties resolve in favour of the label seen first (the nearest neighbour's).
    return max(votes, key=votes.get)


# Accuracy of the hand-written KNN on the test set
def Knn_Score(neighbor = 3):
    """Print and return the fraction of test samples classified correctly.

    Every pair from the module-level ``Xtest`` / ``Ytest`` is classified with
    ``Knn_predict`` and compared with its true label.

    Args:
        neighbor: number of nearest neighbours passed to ``Knn_predict``.

    Returns:
        The accuracy as a float between 0 and 1 (the same value is printed).

    Raises:
        ValueError: if there are no test samples to evaluate.
    """
    correct = 0

    evaluated = 0

    for test,result in zip(Xtest,Ytest):

        evaluated += 1

        # np.all() reduces the comparison to a single truth value whether the
        # stored label is a scalar or a one-element array.
        if np.all(result == Knn_predict(test,neighbor)):

            correct += 1

    if evaluated == 0:
        raise ValueError("Knn_Score needs at least one test sample")

    accuracy = correct / evaluated

    print(accuracy)

    return accuracy



neighbors = 3

Knn_Score(neighbors)

# Cross-check the result against scikit-learn's built-in KNN classifier
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=neighbors)

# np.ravel() flattens a column-shaped label array to the 1-D form
# scikit-learn expects (it leaves an already 1-D array unchanged).
knn.fit(Xtrain,np.ravel(Ytrain))

# Accuracy on the test set, then on the training set
print(knn.score(Xtest,np.ravel(Ytest)))
print(knn.score(Xtrain,np.ravel(Ytrain)))


